package org.jeecg.modules.webcrawler.job;

import java.io.IOException;
import java.util.Date;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.jeecg.common.util.DateUtils;
import org.jeecg.common.util.RedisUtil;
import org.jeecg.modules.webcrawler.entity.WebCrawlerWord;
import org.jeecg.modules.webcrawler.util.WebCrawlerCacheUtils;
import org.jeecg.modules.webcrawler.util.bloomfilter.BloomFilterHelper;
import org.jeecg.modules.webcrawler.util.bloomfilter.BloomRedisService;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.beans.factory.annotation.Autowired;

import lombok.extern.slf4j.Slf4j;

/**
 * 邯郸新闻网
 * 
 * @author Scott
 */
@Slf4j
public class HandanNewsJob implements Job {
	
	@Autowired
    private BloomRedisService redisService;

    @Autowired
    private BloomFilterHelper bloomFilterHelper;
	
	@Override
	public void execute(JobExecutionContext jobExecutionContext) throws JobExecutionException {
		log.info(String.format(" 邯郸新闻网 !  时间:" + DateUtils.getTimestamp()));
		List<WebCrawlerWord> wordList = WebCrawlerCacheUtils.queryWordList();
		
		get_list("马占山", 1, 10);
		get_list("马占山 会议", 1, 11);
		get_list("马占山 讲话", 1, 12);
		get_list("马占山 调研", 1, 13);
		
		for (WebCrawlerWord word : wordList) {
			System.out.println(word.getTitle());
			if(word.getSort() == 1) {		//公安
				get_list(word.getTitle(), 1, 2);
			}else if(word.getSort() == 2) {//经开区
				get_list(word.getTitle(), 1, 102);
			}else if(word.getSort() == 3) {//经开区
				get_list(word.getTitle(), 1, 302);
			}
			
			System.out.println("-------------------------------------");
		}
	}
	
	
	/**
	 * 邯郸新闻网
	 */
	public  void get_list(String keyword,int page,int type){
		//整个html内容
		Document doc;
		int errcount = 0; //重复次数
		try {
			//Thread.sleep(10000);	//
			Connection conn = Jsoup.connect("http://www.handannews.com.cn:9088/servlet/SearchServlet.do?contentKey="+keyword+"&titleKey=&authorKey=&nodeNameResult=&subNodeResult=&dateFrom=&dateEnd=&sort=&op=single&siteID=&pager.offset="+(page-1)*10+"&pageNo="+page).timeout(5000);
			conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			conn.header("Accept-Encoding", "gzip, deflate, sdch");
			conn.header("Accept-Language", "zh-CN,zh;q=0.8");
			conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
			doc = conn.get();

			String name = doc.getElementsByTag("title").text();
			log.info("***************************************"+keyword+"*****第"+page+"页("+name+")********************************************");
			Elements tablelist = doc.select("div#result_list table");
			if(!tablelist.isEmpty()) {
				Elements list = tablelist.get(1).getElementsByTag("td");
				for (Element info : list) {
					
					String title = info.select("a").first().text();
					String url = info.select("a").first().attr("href");
					String time = getTime(url);
					
					if(StringUtils.isNotEmpty(url)) {		//爬虫过滤重复url
						if(redisService.includeByBloomFilter(bloomFilterHelper, "www.handannews.com.cn"+type, url)){  //url已存在
							errcount++;
						}else {
							redisService.addByBloomFilter(bloomFilterHelper, "www.handannews.com.cn"+type, url);
							if(WebCrawlerCacheUtils.getTotalCount(url, type) > 0) {
								errcount++;
								break;
							}else {
								Date date = DateUtils.stringToDate(time, "yyyy-MM-dd HH:mm");
								WebCrawlerCacheUtils.addArticle(title, url, date, type, "邯郸新闻网");
							}
						}
					}
					log.info(title);
					log.info(url);
					log.info(time);
					log.info("----------------------------------------重复次数"+errcount+"--------------------------------------------------");
				}
			}
			
			//查询分页列表
			page++;
			if(!tablelist.isEmpty() && errcount < 8) {
				//get_list(keyword, page, type);
			}
		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} // 设置连接超时时间 
	}
	

	
	public  String getTime(String url){
		//整个html内容
		Document doc;
		boolean flag = true;
		try {
			Connection conn = Jsoup.connect(url).timeout(5000);
			conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			conn.header("Accept-Encoding", "gzip, deflate, sdch");
			conn.header("Accept-Language", "zh-CN,zh;q=0.8");
			conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
			doc = conn.get();
			//打印html文档的<title>内容
			String time = doc.select("div.date-source span").first().text();
			System.out.println(time);
			return time;
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} // 设置连接超时时间 
		return null;
	}
	
	
	public static void main(String[] args) {
//		String[] keywords = {"邯郸民警","破获","扫黑除恶","治安拘留","黄赌毒","邯郸警方","邯郸公安"};
//		for (String keyword : keywords) {
//			log.info(keyword);
//			new HandanNewsJob().get_list(keyword, 1);
//		}
		new HandanNewsJob().get_list("扫黑除恶", 1, 2);
	}
}
